import scanpy as sc, anndata as ad, numpy as np, pandas as pd
from scipy import sparse
from anndata import AnnData
import warnings
import socket
from matplotlib import pylab
import yaml
import os
warnings.filterwarnings('ignore')
import ipynbname
try:
nb_fname = ipynbname.name()
except:
nb_fname = "".join(os.path.basename(globals()["__vsc_ipynb_file__"]).split(".")[:-1])
%matplotlib inline
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=50, facecolor='white', dpi_save=500)
pylab.rcParams['figure.figsize'] = (9, 9)
scanpy==1.8.1 anndata==0.7.6 umap==0.4.6 numpy==1.20.2 scipy==1.6.3 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.13.1 python-igraph==0.9.8 louvain==0.7.1 pynndescent==0.5.5
hostRoot = "-".join(socket.gethostname().split('-')[0:2])
#indir=paths["paths"]["indir"][hostRoot]
outdir="./outdir"
#projectBaseDir=paths["paths"]["projectBaseDir"][hostRoot]
adata = sc.read_h5ad(outdir+"/1_polaroid_mint.h5ad")
sc.pl.violin(adata, 'total_counts',
groupby= "type", jitter=0.4, multi_panel=True, rotation=90)
# Since distirbutions are really close we will use araw average of dbls prior and cut the joint adata
putDBLs = round(adata.obs.groupby(["type"]).size()/1000*.9).mean(axis = 0)
putDBLs = round(adata.shape[0]/100*putDBLs)
putDBLs = adata.obs.total_counts.sort_values()[-putDBLs:].index
adata = adata[~adata.obs_names.isin(putDBLs)]
#runAdata
sc.pl.violin(adata, 'total_counts',
groupby= "type", jitter=0.4, multi_panel=True, rotation=90)
sc.pl.violin(adata, ['pct_counts_mt','pct_counts_ribo'],
groupby= "dataset", jitter=0.4, multi_panel=True, rotation=90)
adata = adata[adata.obs.pct_counts_ribo < 40, :]
adata = adata[adata.obs.pct_counts_mt < 5, :]
sc.pl.violin(adata, ['pct_counts_mt','pct_counts_ribo'],
groupby= "dataset", jitter=0.4, multi_panel=True, rotation=90)
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
Trying to set attribute `.obs` of view, copying.
filtered out 5171 genes that are detected in less than 3 cells
adata
AnnData object with n_obs × n_vars = 22220 × 28371
obs: 'dataset', 'organoid', 'region', 'type', 'type_region', 'regionContrast', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'n_genes'
var: 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells'
uns: 'dataset_colors', 'organoid_colors', 'regionContrast_colors', 'region_colors', 'type_colors'
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
normalizing counts per cell
finished (0:00:00)
Here I would compute HVGs:
Vertical intersection = set A
-polaroid 1,2,3 > proximal (intersection)
-polaroid 1,2,3 > medial (intersection)
-polaroid 1,2,3 > distal (intersection)
-ctl 1,2,3 > P1 (intersection)
-ctl 1,2,3 > P2 (intersection)
-ctl 1,2,3 > P3 (intersection)
Union of all 6
Horizontal intersection = set B
polaroid Proximal (1,2,3) vs polaroid medial (1,2,3) > 9 possible combinations (intersection)
polaroid Medial (1,2,3) vs polaroid distal (1,2,3) > 9 possible combinations (intersection)
ctl piece 1 (1,2,3) vs ctl piece 2 (1,2,3) > 9 possible combinations (intersection)
ctl piece 2 (1,2,3) vs ctl piece 3 (1,2,3) > 9 possible combinations (intersection)
Union of all 4
Finally (A).union(B)
adata.obs.type_region.unique()
['control_piece2', 'control_piece1', 'control_piece3', 'polaroid_medial', 'polaroid_distal', 'polaroid_proximal'] Categories (6, object): ['control_piece2', 'control_piece1', 'control_piece3', 'polaroid_medial', 'polaroid_distal', 'polaroid_proximal']
# Given the epxloratory phase as many genes were retained
VERTICAL_HVGs = {}
for group in adata.obs.type_region.unique():
adata_group = adata[adata.obs.type_region == group].copy()
HVGsDF = sc.pp.highly_variable_genes(adata_group, min_mean=0.0125, max_mean=3, min_disp=0.5, inplace=False, batch_key="organoid")
VERTICAL_HVGs[group] = set(HVGsDF[HVGsDF["highly_variable_nbatches"] == 3].index)
VERTICAL_HVGs = set.union(*list(VERTICAL_HVGs.values()))
extracting highly variable genes
finished (0:00:01)
extracting highly variable genes
finished (0:00:00)
extracting highly variable genes
finished (0:00:00)
extracting highly variable genes
finished (0:00:00)
extracting highly variable genes
finished (0:00:00)
extracting highly variable genes
finished (0:00:01)
len(VERTICAL_HVGs)
3521
import itertools
# Setting up contrasts
proximal = ["polaroid1_proximal","polaroid2_proximal","polaroid3_proximal"]
medial = ["polaroid1_medial","polaroid2_medial","polaroid3_medial"]
distal = ["polaroid1_distal","polaroid2_distal","polaroid3_distal"]
p1 = ["control1_piece1","control2_piece1","control3_piece1"]
p2 = ["control1_piece2","control2_piece2","control3_piece2"]
p3 = ["control1_piece3","control2_piece3","control3_piece3"]
proximal_vs_medial = list(itertools.product(proximal, medial))
medial_vs_distal = list(itertools.product(medial, distal))
p1_vs_p2 = list(itertools.product(p1, p2))
p2_vs_p3 = list(itertools.product(p2, p3))
HORIZONTAL_HVGs = {}
# Proximal vs distal regions
# Proximal vs distal regions
# Proximal vs distal regions
proximal_vs_medial_HVGs = {}
for contrast in proximal_vs_medial:
adataContrast = adata[adata.obs["dataset"].isin(list(contrast))].copy()
print(adataContrast.obs.dataset.value_counts())
sc.pp.highly_variable_genes(adataContrast, min_mean=0.0125, max_mean=3, min_disp=0.5)
HVGs = set(adataContrast.var[adataContrast.var.highly_variable].index.tolist())
proximal_vs_medial_HVGs["_".join(contrast)] = HVGs
proximal_vs_medial_HVGs = pd.Series(list(itertools.chain.from_iterable([list(i) for i in list(proximal_vs_medial_HVGs.values())])))
proximal_vs_medial_HVGs = set(proximal_vs_medial_HVGs.value_counts()[proximal_vs_medial_HVGs.value_counts() == 9].index.tolist())
HORIZONTAL_HVGs["proximal_vs_medial_HVGs"] = proximal_vs_medial_HVGs
# medial vs distal regions
# medial vs distal regions
# medial vs distal regions
medial_vs_distal_HVGs = {}
for contrast in medial_vs_distal:
adataContrast = adata[adata.obs["dataset"].isin(list(contrast))].copy()
print(adataContrast.obs.dataset.value_counts())
sc.pp.highly_variable_genes(adataContrast, min_mean=0.0125, max_mean=3, min_disp=0.5)
HVGs = set(adataContrast.var[adataContrast.var.highly_variable].index.tolist())
medial_vs_distal_HVGs["_".join(contrast)] = HVGs
medial_vs_distal_HVGs = pd.Series(list(itertools.chain.from_iterable([list(i) for i in list(medial_vs_distal_HVGs.values())])))
medial_vs_distal_HVGs = set(medial_vs_distal_HVGs.value_counts()[medial_vs_distal_HVGs.value_counts() == 9].index.tolist())
HORIZONTAL_HVGs["medial_vs_distal_HVGs"] = medial_vs_distal_HVGs
# P1 vs P2
# P1 vs P2
# P1 vs P2
p1_vs_p2_HVGs = {}
for contrast in p1_vs_p2:
adataContrast = adata[adata.obs["dataset"].isin(list(contrast))].copy()
print(adataContrast.obs.dataset.value_counts())
sc.pp.highly_variable_genes(adataContrast, min_mean=0.0125, max_mean=3, min_disp=0.5)
HVGs = set(adataContrast.var[adataContrast.var.highly_variable].index.tolist())
p1_vs_p2_HVGs["_".join(contrast)] = HVGs
p1_vs_p2_HVGs = pd.Series(list(itertools.chain.from_iterable([list(i) for i in list(p1_vs_p2_HVGs.values())])))
p1_vs_p2_HVGs = set(p1_vs_p2_HVGs.value_counts()[p1_vs_p2_HVGs.value_counts() == 9].index.tolist())
HORIZONTAL_HVGs["p1_vs_p2_HVGs"] = p1_vs_p2_HVGs
# P2 vs P3
# P2 vs P3
# P2 vs P3
p2_vs_p3_HVGs = {}
for contrast in p2_vs_p3:
adataContrast = adata[adata.obs["dataset"].isin(list(contrast))].copy()
print(adataContrast.obs.dataset.value_counts())
sc.pp.highly_variable_genes(adataContrast, min_mean=0.0125, max_mean=3, min_disp=0.5)
HVGs = set(adataContrast.var[adataContrast.var.highly_variable].index.tolist())
p2_vs_p3_HVGs["_".join(contrast)] = HVGs
p2_vs_p3_HVGs = pd.Series(list(itertools.chain.from_iterable([list(i) for i in list(p2_vs_p3_HVGs.values())])))
p2_vs_p3_HVGs = set(p2_vs_p3_HVGs.value_counts()[p2_vs_p3_HVGs.value_counts() == 9].index.tolist())
HORIZONTAL_HVGs["p2_vs_p3_HVGs"] = p2_vs_p3_HVGs
polaroid1_proximal 2410
polaroid1_medial 1938
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid1_proximal 2410
polaroid2_medial 733
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid1_proximal 2410
polaroid3_medial 675
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid2_proximal 2396
polaroid1_medial 1938
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid2_proximal 2396
polaroid2_medial 733
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid2_proximal 2396
polaroid3_medial 675
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid3_proximal 2026
polaroid1_medial 1938
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid3_proximal 2026
polaroid2_medial 733
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid3_proximal 2026
polaroid3_medial 675
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid1_medial 1938
polaroid1_distal 1165
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid1_medial 1938
polaroid2_distal 306
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid1_medial 1938
polaroid3_distal 250
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid1_distal 1165
polaroid2_medial 733
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid2_medial 733
polaroid2_distal 306
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid2_medial 733
polaroid3_distal 250
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid1_distal 1165
polaroid3_medial 675
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid3_medial 675
polaroid2_distal 306
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid3_medial 675
polaroid3_distal 250
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control1_piece2 607
control1_piece1 376
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control2_piece2 1339
control1_piece1 376
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control3_piece2 1552
control1_piece1 376
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control2_piece1 1259
control1_piece2 607
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control2_piece2 1339
control2_piece1 1259
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control3_piece2 1552
control2_piece1 1259
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control3_piece1 1429
control1_piece2 607
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control3_piece1 1429
control2_piece2 1339
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control3_piece2 1552
control3_piece1 1429
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control1_piece3 625
control1_piece2 607
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control2_piece3 1175
control1_piece2 607
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control3_piece3 1959
control1_piece2 607
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control2_piece2 1339
control1_piece3 625
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control2_piece2 1339
control2_piece3 1175
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control3_piece3 1959
control2_piece2 1339
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control3_piece2 1552
control1_piece3 625
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control3_piece2 1552
control2_piece3 1175
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control3_piece3 1959
control3_piece2 1552
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
HORIZONTAL_HVGs = set.union(*list(HORIZONTAL_HVGs.values()))
JointHVGs = list(HORIZONTAL_HVGs.union(VERTICAL_HVGs))
adata.var["highly_variable"] = adata.var_names.isin(JointHVGs)
adata.raw = adata.copy()
sc.pp.scale(adata, max_value=20)
... as `zero_center=True`, sparse input is densified and may lead to large memory consumption
sc.tl.pca(adata, svd_solver='arpack', use_highly_variable=True)
computing PCA
on highly variable genes
with n_comps=50
finished (0:00:06)
sc.pl.pca(adata, color=["organoid","type","dataset",], size = 30, add_outline = True,outline_width=(0.2, 0.05))
sc.pl.pca_variance_ratio(adata, log=True)
sc.pp.neighbors(adata, n_neighbors=20, n_pcs=20)
computing neighbors
using 'X_pca' with n_pcs = 20
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:10)
sc.tl.umap(adata)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:00:13)
sc.tl.diffmap(adata)
computing Diffusion Maps using n_comps=15(=n_dcs)
computing transitions
finished (0:00:00)
eigenvalues of transition matrix
[1. 0.9980042 0.9973144 0.99366105 0.9917391 0.99092036
0.9891531 0.9869507 0.9857839 0.9838741 0.98252034 0.98203415
0.98055375 0.9791002 0.97739536]
finished: added
'X_diffmap', diffmap coordinates (adata.obsm)
'diffmap_evals', eigenvalues of transition matrix (adata.uns) (0:00:00)
sc.pl.umap(adata, color=["organoid","type","dataset",], size = 30, add_outline = True,outline_width=(0.2, 0.05))
sc.pl.diffmap(adata, color=["organoid","type","dataset",], size = 30, add_outline = True,outline_width=(0.2, 0.05))
sc.tl.draw_graph(adata, n_jobs=4)
drawing single-cell graph using layout 'fa'
finished: added
'X_draw_graph_fa', graph_drawing coordinates (adata.obsm) (0:02:33)
sc.pl.draw_graph(adata, color=["organoid","type","dataset","MKI67"], size = 30, add_outline = True,outline_width=(0.2, 0.05))
adata.write_h5ad(outdir+"/02A_Preprocess.h5ad")
adata = sc.read_h5ad(outdir+"/02A_Preprocess.h5ad")
print(adata.obsp["connectivities"].data)
print(adata.obsp["distances"])
[0.15580252 0.13195293 1. ... 0.13278878 0.075919725 0.11208251 ] (0, 1048) 12.065792083740234 (0, 1769) 9.329296112060547 (0, 1821) 11.881159782409668 (0, 3178) 12.285615921020508 (0, 4778) 9.072836875915527 (0, 5245) 12.623744010925293 (0, 5631) 12.39741039276123 (0, 6303) 8.042698860168457 (0, 6690) 10.714126586914062 (0, 7250) 10.81225872039795 (0, 8621) 10.401694297790527 (0, 8724) 9.853194236755371 (0, 9113) 8.979679107666016 (0, 9193) 10.61874008178711 (0, 9288) 11.086877822875977 (0, 10187) 12.657814025878906 (0, 10188) 10.93688678741455 (0, 10257) 7.547016620635986 (0, 11823) 11.791207313537598 (1, 1141) 9.481132507324219 (1, 1298) 8.799959182739258 (1, 1474) 9.789313316345215 (1, 1498) 9.047750473022461 (1, 1767) 9.519059181213379 (1, 1818) 9.610590934753418 : : (22218, 21114) 9.191518783569336 (22218, 21671) 5.599649429321289 (22218, 21816) 8.500561714172363 (22218, 21835) 8.81576919555664 (22218, 21966) 6.717822074890137 (22218, 22048) 6.033876419067383 (22219, 1477) 6.691205978393555 (22219, 3935) 8.230859756469727 (22219, 4782) 8.194168090820312 (22219, 7293) 6.904054641723633 (22219, 10451) 7.492819786071777 (22219, 10618) 8.102606773376465 (22219, 10852) 5.850067615509033 (22219, 11314) 5.759396553039551 (22219, 11894) 6.0283989906311035 (22219, 13500) 7.062030792236328 (22219, 13614) 5.425055503845215 (22219, 13749) 8.004842758178711 (22219, 14162) 8.029681205749512 (22219, 14329) 7.746777057647705 (22219, 14752) 5.958281517028809 (22219, 14821) 5.19552755355835 (22219, 15186) 6.956639289855957 (22219, 15278) 5.219976902008057 (22219, 15317) 8.238070487976074
import anndata2ri
import rpy2.rinterface_lib.callbacks
import logging
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
anndata2ri.activate()
adata_no_knn_all = adata.copy()
adata_no_knn_all.obsp = None
adata_no_knn_all.uns.pop("neighbors")
HVGsList = adata_no_knn_all.var[adata_no_knn_all.var["highly_variable"]].index.tolist()
len(HVGsList)
3536
%load_ext rpy2.ipython
%%R
library(gruffi)
library(MarkdownReports)
%%R -i adata_no_knn_all -i HVGsList
adata_no_knn_all_SOBJ <- as.Seurat(adata_no_knn_all, counts = "X", data = "X")
adata_no_knn_all_SOBJ@reductions$pca <- adata_no_knn_all_SOBJ@reductions$PCA
VariableFeatures(adata_no_knn_all_SOBJ) <- HVGsList
adata_no_knn_all_SOBJ <- FindNeighbors(adata_no_knn_all_SOBJ, dims = 1:50,features = HVGsList)
adata_no_knn_all_SOBJ <- Seurat.utils::SetupReductionsNtoKdimensions(obj = adata_no_knn_all_SOBJ, nPCs = 50, dimensions=3:2, reduction="umap")
adata_no_knn_all_SOBJ <- aut.res.clustering(obj = adata_no_knn_all_SOBJ, assay="originalexp" )
adata_no_knn_all_SOBJ@assays$RNA <- adata_no_knn_all_SOBJ@assays$originalexp
[1] "3 dimensional umap is calculated" [1] "2 dimensional umap is calculated" [1] "Current clustering resolution: 2" [1] "Current median cluster size: 532" [1] "Search between 2 and 22." [1] "Suggested resolution is 12 and the respective clustering is stored in Idents(obj) and in @meta.data. The median numbers of cells per cluster is 171.5. The number of clusters is 128"
%%R
ensembl <- biomaRt::useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl", version="107")
go1 <- "GO:0006096" # Glycolysis
go2 <- "GO:0034976" # ER-stress
go3 <- "GO:0042063" # Gliogenesis, negative filtering
%%R
granule.res.4.gruffi <- adata_no_knn_all_SOBJ@misc$gruffi$'optimal.granule.res'
adata_no_knn_all_SOBJ <- reassign.small.clusters(adata_no_knn_all_SOBJ, ident = granule.res.4.gruffi) # will be stored in meta data column as "seurat_clusters.reassigned"
granule.res.4.gruffi <- paste0(granule.res.4.gruffi, '.reassigned')
[1] "1 clusters have < 30 cells, which need to be reassigned." [1] "THE 3D UMAP IN @misc$'reductions.backup WILL BE USED." [1] "cluster: 127" [1] "originalexp_snn_res.12.reassigned" [1] "Call plot.clust.size.distr() to check results."
%%R
adata_no_knn_all_SOBJ <- GO_score_evaluation(obj = adata_no_knn_all_SOBJ, GO_term = go1, save.UMAP = TRUE, new_GO_term_computation = T, clustering = granule.res.4.gruffi, plot.each.gene = F)
# ER stress GO:0034976
adata_no_knn_all_SOBJ <- GO_score_evaluation(obj = adata_no_knn_all_SOBJ, GO_term = go2, save.UMAP = TRUE, new_GO_term_computation = T, clustering = granule.res.4.gruffi, plot.each.gene = F)
# Gliogenesis GO:0042063
adata_no_knn_all_SOBJ <- GO_score_evaluation(obj = adata_no_knn_all_SOBJ, GO_term = go3, save.UMAP = TRUE, new_GO_term_computation = T, clustering = granule.res.4.gruffi, plot.each.gene = F)
[1] "Score.GO.0006096" [1] "For GO score computation assay is set to RNA. It will be reset to Seurat::DefaultAssay() afterwards." [1] "Score.GO.0006096 not found. Need to call GetGOTerms(). set only.draw.plot = FALSE." [1] "GENE SCORE WILL NOW BE SET / OVERWRITTEN" [1] "GetGOTerms()" [1] "83 Gene symbols downloaded: PKLR GPI SLC2A6 APP EP300 HTR2A LDHA PGK2 PFKL DHTKD1" [1] "83 Gene symbols downloaded: PKLR GPI SLC2A6 APP EP300 HTR2A LDHA PGK2 PFKL DHTKD1" [1] "IntersectWithExpressed()" [1] "4 genes (of 83 ) are MISSING from the Seurat object: PGK2 PRXL2C IFNG TREX1" [1] "Genes in GO:0006096 are saved under obj@misc$GO$ GO.0006096" [1] "ENO1" "MTOR" "PRKAA2" "PGM1" "ARNT" "PKLR" [1] "AddGOScore()" [1] "Trailing '1' in metadata column name is removed. Column name: Score.GO.0006096" [1] "OutDir not defined !!! Saving in working directory." [1] "/group/testa/Users/davide.castaldi/Polaroid_publication/Notebooks_release_2/FeaturePlot.Score.GO.0006096..png" [1] "Calculating cl. average score" [1] "obj split by originalexp_snn_res.12.reassigned" [1] "originalexp_snn_res.12.reassigned_cl.av_GO:0006096" [1] "Score.GO.0034976" [1] "For GO score computation assay is set to RNA. It will be reset to Seurat::DefaultAssay() afterwards." [1] "Score.GO.0034976 not found. Need to call GetGOTerms(). set only.draw.plot = FALSE." [1] "GENE SCORE WILL NOW BE SET / OVERWRITTEN" [1] "GetGOTerms()" [1] "259 Gene symbols downloaded: HSPA1A DERL3 ALOX5 BAG6 RNF5 BFAR ATF6B FLOT1 HYOU1 SERP2" [1] "259 Gene symbols downloaded: HSPA1A DERL3 ALOX5 BAG6 RNF5 BFAR ATF6B FLOT1 HYOU1 SERP2" [1] "IntersectWithExpressed()" [1] "7 genes (of 259 ) are MISSING from the Seurat object: MARCHF6 PDX1 MAGEA3 NIBAN1 CERT1 RNF186 CXCL8" [1] "Genes in GO:0034976 are saved under obj@misc$GO$ GO.0034976" [1] "UBE2J2" "PARK7" "UBE4B" "FBXO2" "FBXO44" "FBXO6" [1] "AddGOScore()" [1] "Trailing '1' in metadata column name is removed. Column name: Score.GO.0034976" [1] "OutDir not defined !!! Saving in working directory." [1] "/group/testa/Users/davide.castaldi/Polaroid_publication/Notebooks_release_2/FeaturePlot.Score.GO.0034976..png" [1] "Calculating cl. average score" [1] "obj split by originalexp_snn_res.12.reassigned" [1] "originalexp_snn_res.12.reassigned_cl.av_GO:0034976" [1] "Score.GO.0042063" [1] "For GO score computation assay is set to RNA. It will be reset to Seurat::DefaultAssay() afterwards." [1] "Score.GO.0042063 not found. Need to call GetGOTerms(). set only.draw.plot = FALSE." [1] "GENE SCORE WILL NOW BE SET / OVERWRITTEN" [1] "GetGOTerms()" [1] "314 Gene symbols downloaded: MBOAT7 DLL1 MAPT AZU1 ARHGEF10 CCL3 TNF AGER WASF3 NKX2-2" [1] "314 Gene symbols downloaded: MBOAT7 DLL1 MAPT AZU1 ARHGEF10 CCL3 TNF AGER WASF3 NKX2-2" [1] "IntersectWithExpressed()" [1] "7 genes (of 314 ) are MISSING from the Seurat object: CCL3 PALS1 IFNG BMERB1 C1QA S100A9 S100A8" [1] "Genes in GO:0042063 are saved under obj@misc$GO$ GO.0042063" [1] "MXRA8" "SKI" "HES5" "TP73" "GPR157" "MTOR" [1] "AddGOScore()" [1] "Trailing '1' in metadata column name is removed. Column name: Score.GO.0042063" [1] "OutDir not defined !!! Saving in working directory." [1] "/group/testa/Users/davide.castaldi/Polaroid_publication/Notebooks_release_2/FeaturePlot.Score.GO.0042063..png" [1] "Calculating cl. average score" [1] "obj split by originalexp_snn_res.12.reassigned" [1] "originalexp_snn_res.12.reassigned_cl.av_GO:0042063"
%%R
(i1 <- Stringendo::kppu(granule.res.4.gruffi, 'cl.av', go1))
(i2 <- Stringendo::kppu(granule.res.4.gruffi, 'cl.av', go2))
(i3 <- Stringendo::kppu(granule.res.4.gruffi, 'cl.av', go3))
[1] "originalexp_snn_res.12.reassigned_cl.av_GO:0042063"
%%R
Shiny.GO.thresh <- function(obj = combined.obj
, proposed.method = c("fitted","empirical")[1]
, quantile = c(.99,.9)[1]
, stress.ident1 = paste0(Seurat.utils::GetClusteringRuns(obj)[1],"_cl.av_GO:0006096")
, stress.ident2 = paste0(Seurat.utils::GetClusteringRuns(obj)[1],"_cl.av_GO:0034976")
, notstress.ident3 = paste0(Seurat.utils::GetClusteringRuns(obj)[1],"_cl.av_GO:0042063")
, notstress.ident4 = NULL
, plot.cluster.shiny = Seurat.utils::GetClusteringRuns(obj)[1]
, call.shiny.app = TRUE) {
app_env <- new.env()
meta <- obj@meta.data
if(!is.null(stress.ident1)) stopifnot(stress.ident1 %in% colnames(meta))
if(!is.null(stress.ident2)) stopifnot(stress.ident2 %in% colnames(meta))
if(!is.null(notstress.ident3)) stopifnot(notstress.ident3 %in% colnames(meta))
if(!is.null(notstress.ident4)) stopifnot(notstress.ident4 %in% colnames(meta))
av.stress.ident1 <- as.numeric(levels(meta[,stress.ident1]))
av.stress.ident2 <- as.numeric(levels(meta[,stress.ident2]))
av.notstress.ident3 <- as.numeric(levels(meta[,notstress.ident3]))
av.notstress.ident4 <- as.numeric(levels(meta[,notstress.ident4]))
if(call.shiny.app) {
# compute proposals for thresholds
app_env$thresh.stress.ident1 <- PlotNormAndSkew(av.stress.ident1, q = quantile, tresholding = proposed.method, plot.hist = F)
app_env$thresh.stress.ident2 <- PlotNormAndSkew(av.stress.ident2, q = quantile, tresholding = proposed.method, plot.hist = F)
app_env$thresh.notstress.ident3 <- PlotNormAndSkew(av.notstress.ident3, q = quantile, tresholding = proposed.method, plot.hist = F)
app_env$thresh.notstress.ident4 <- PlotNormAndSkew(av.notstress.ident4, q = quantile, tresholding = proposed.method, plot.hist = F)
min.x.stress.ident1 <- floor(min(av.stress.ident1, app_env$thresh.stress.ident1))
max.x.stress.ident1 <- ceiling(max(av.stress.ident1, app_env$thresh.stress.ident1))
step.stress.ident1 <- 0.001
min.x.stress.ident2 <- floor(min(av.stress.ident2, app_env$thresh.stress.ident2))
max.x.stress.ident2 <- ceiling(max(av.stress.ident2, app_env$thresh.stress.ident2))
step.stress.ident2 <- 0.001
min.x.notstress.ident3 <- floor(min(av.notstress.ident3, app_env$thresh.notstress.ident3))
max.x.notstress.ident3 <- ceiling(max(av.notstress.ident3, app_env$thresh.notstress.ident3))
step.notstress.ident3 <- 0.001
min.x.notstress.ident4 <- floor(min(av.notstress.ident4, app_env$thresh.notstress.ident4))
max.x.notstress.ident4 <- ceiling(max(av.notstress.ident4, app_env$thresh.notstress.ident4))
step.notstress.ident4 <- 0.001
app_env$idents <- list(stress.ident1 = stress.ident1,
stress.ident2 = stress.ident2,
notstress.ident3 = notstress.ident3,
notstress.ident4 = notstress.ident4)
app_env$sliders <- list(min.x.stress.ident1 = min.x.stress.ident1, max.x.stress.ident1 = max.x.stress.ident1, step.stress.ident1 = step.stress.ident1,
min.x.stress.ident2 = min.x.stress.ident2, max.x.stress.ident2 = max.x.stress.ident2, step.stress.ident2 = step.stress.ident2,
min.x.notstress.ident3 = min.x.notstress.ident3, max.x.notstress.ident3 = max.x.notstress.ident3, step.notstress.ident3 = step.notstress.ident3,
min.x.notstress.ident4 = min.x.notstress.ident4, max.x.notstress.ident4 = max.x.notstress.ident4, step.notstress.ident4 = step.notstress.ident4)
app_env$average.vec <- list(av.stress.ident1 = av.stress.ident1,
av.stress.ident2 = av.stress.ident2,
av.notstress.ident3 = av.notstress.ident3,
av.notstress.ident4 = av.notstress.ident4)
app_env$obj <- obj
app_env$plot.cluster.shiny <- plot.cluster.shiny
app_dir <- system.file("shiny", "GO.thresh", package = "gruffi")
app_ui <- source(file.path(app_dir, "ui.R"), local=new.env(parent=app_env),
echo=FALSE, keep.source=TRUE)$value
app_server <- source(file.path(app_dir, "server.R"), local=new.env(parent=app_env),
echo=FALSE, keep.source=TRUE)$value
obj <- shiny::runApp(shiny::shinyApp(app_ui, app_server))
} else {
# compute proposals for thresholds
thresh.stress.ident1 <- PlotNormAndSkew(av.stress.ident1, q = quantile, tresholding = proposed.method, plot.hist = F)
thresh.stress.ident2 <- PlotNormAndSkew(av.stress.ident2, q = quantile, tresholding = proposed.method, plot.hist = F)
thresh.notstress.ident3 <- PlotNormAndSkew(av.notstress.ident3, q = quantile, tresholding = proposed.method, plot.hist = F)
thresh.notstress.ident4 <- PlotNormAndSkew(av.notstress.ident4, q = quantile, tresholding = proposed.method, plot.hist = F)
if(!is.null(stress.ident1)) {
obj@misc$gruffi$thresh.stress.ident1 <- thresh.stress.ident1
obj$stress.ident1.thresh_cluster <- as.numeric(levels(obj@meta.data[,stress.ident1]))[obj@meta.data[,stress.ident1]] > thresh.stress.ident1
obj$stress.ident1.thresh_cluster[obj$stress.ident1.thresh_cluster == FALSE] <- F
obj$stress.ident1.thresh_cluster[obj$stress.ident1.thresh_cluster == TRUE] <- T
}
if(!is.null(stress.ident2)) {
obj@misc$gruffi$thresh.stress.ident2 <- thresh.stress.ident2
obj$stress.ident2.thresh_cluster <- as.numeric(levels(obj@meta.data[,stress.ident2]))[obj@meta.data[,stress.ident2]] > thresh.stress.ident2
obj$stress.ident2.thresh_cluster[obj$stress.ident2.thresh_cluster == FALSE] <- F
obj$stress.ident2.thresh_cluster[obj$stress.ident2.thresh_cluster == TRUE] <- T
}
if(!is.null(notstress.ident3)) {
obj@misc$gruffi$thresh.notstress.ident3 <- thresh.notstress.ident3
obj$notstress.ident3.thresh_cluster <- as.numeric(levels(obj@meta.data[,notstress.ident3]))[obj@meta.data[,notstress.ident3]] > thresh.notstress.ident3
obj$notstress.ident3.thresh_cluster[obj$notstress.ident3.thresh_cluster == FALSE] <- F
obj$notstress.ident3.thresh_cluster[obj$notstress.ident3.thresh_cluster == TRUE] <- T
}
if(!is.null(notstress.ident4)) {
obj@misc$gruffi$thresh.notstress.ident4 <- thresh.notstress.ident4
obj$notstress.ident4.thresh_cluster <- as.numeric(levels(obj@meta.data[,notstress.ident4]))[obj@meta.data[,notstress.ident4]] > thresh.notstress.ident4
obj$notstress.ident4.thresh_cluster[obj$notstress.ident4.thresh_cluster == FALSE] <- F
obj$notstress.ident4.thresh_cluster[obj$notstress.ident4.thresh_cluster == TRUE] <- T
}
if(!is.null(stress.ident1)) {
i1.bool <- as.numeric(levels(obj@meta.data[,stress.ident1]))[obj@meta.data[,stress.ident1]] > thresh.stress.ident1
if(!is.null(stress.ident2)) {
i2.bool <- as.numeric(levels(obj@meta.data[,stress.ident2]))[obj@meta.data[,stress.ident2]] > thresh.stress.ident2
stress.bool <- i1.bool | i2.bool
}
else {
stress.bool <- i1.bool
}
} else {
if(!is.null(stress.ident2)) {
i2.bool <- as.numeric(levels(obj@meta.data[,stress.ident2]))[obj@meta.data[,stress.ident2]] > thresh.stress.ident2
stress.bool <- i2.bool
}
}
notstress.bool <- NULL
if(!is.null(notstress.ident3)) {
i3.bool <- as.numeric(levels(obj@meta.data[,notstress.ident3]))[obj@meta.data[,notstress.ident3]] > thresh.notstress.ident3
if(!is.null(notstress.ident4)) {
i4.bool <- as.numeric(levels(obj@meta.data[,notstress.ident4]))[obj@meta.data[,notstress.ident4]] > thresh.notstress.ident4
notstress.bool <- i3.bool | i4.bool
}
else {
notstress.bool <- i3.bool
}
} else {
if(!is.null(notstress.ident4)) {
i4.bool <- as.numeric(levels(obj@meta.data[,notstress.ident4]))[obj@meta.data[,notstress.ident4]] > thresh.notstress.ident4
notstress.bool <- i4.bool
}
}
if(!is.null(notstress.bool)) {
obj$is.Stressed <- stress.bool & !notstress.bool
} else {
obj$is.Stressed <- stress.bool
}
ptlist <- list()
count <- 1
if(!is.null(stress.ident1)) {
ptlist[[count]] <- Seurat::FeaturePlot(obj = obj, features = ww.convert.GO_term.2.score(paste0(substr(x = strsplit(stress.ident1, "cl.av")[[1]][2], 2, nchar(strsplit(stress.ident1, "cl.av")[[1]][2])))), min.cutoff = "q01", max.cutoff = "q99")+Seurat::NoLegend()+Seurat::NoAxes()
count <- count + 1
}
if(!is.null(stress.ident2)) {
ptlist[[count]] <- Seurat::FeaturePlot(obj = obj, features = ww.convert.GO_term.2.score(paste0(substr(x = strsplit(stress.ident2, "cl.av")[[1]][2], 2, nchar(strsplit(stress.ident2, "cl.av")[[1]][2])))), min.cutoff = "q01", max.cutoff = "q99")+Seurat::NoLegend()+Seurat::NoAxes()
count <- count + 1
}
if(!is.null(notstress.ident3)) {
ptlist[[count]] <- Seurat::FeaturePlot(obj = obj, features = ww.convert.GO_term.2.score(paste0(substr(x = strsplit(notstress.ident3, "cl.av")[[1]][2], 2, nchar(strsplit(notstress.ident3, "cl.av")[[1]][2])))), min.cutoff = "q01", max.cutoff = "q99")+Seurat::NoLegend()+Seurat::NoAxes()
count <- count + 1
}
if(!is.null(notstress.ident4)) {
ptlist[[count]] <- Seurat::FeaturePlot(obj = obj, features = ww.convert.GO_term.2.score(paste0(substr(x = strsplit(notstress.ident4, "cl.av")[[1]][2], 2, nchar(strsplit(notstress.ident4, "cl.av")[[1]][2])))), min.cutoff = "q01", max.cutoff = "q99")+Seurat::NoLegend()+Seurat::NoAxes()
count <- count + 1
}
if(!is.null(stress.ident1)) {
ptlist[[count]] <- Seurat.utils::clUMAP(obj = obj, ident = "stress.ident1.thresh_cluster", save.plot = F, aspect.ratio = 1) +
ggplot2::ggtitle(ggplot2::element_blank()) +
ggplot2::scale_color_manual(values=c(Seurat.utils::gg_color_hue(2)[2], Seurat.utils::gg_color_hue(2)[1]))+ Seurat::NoAxes()
count <- count + 1
}
if(!is.null(stress.ident2)) {
ptlist[[count]] <- Seurat.utils::clUMAP(obj = obj, ident = "stress.ident2.thresh_cluster", save.plot = F, aspect.ratio = 1) +
ggplot2::ggtitle(ggplot2::element_blank()) +
ggplot2::scale_color_manual(values=c(Seurat.utils::gg_color_hue(2)[2], Seurat.utils::gg_color_hue(2)[1]))+ Seurat::NoAxes()
count <- count + 1
}
if(!is.null(notstress.ident3)) {
ptlist[[count]] <- Seurat.utils::clUMAP(obj = obj, ident = "notstress.ident3.thresh_cluster", save.plot = F, aspect.ratio = 1) +
ggplot2::ggtitle(ggplot2::element_blank()) +
ggplot2::scale_color_manual(values=c(Seurat.utils::gg_color_hue(2)[1], Seurat.utils::gg_color_hue(2)[2]))+ Seurat::NoAxes()
count <- count + 1
}
if(!is.null(notstress.ident4)) {
ptlist[[count]] <- Seurat.utils::clUMAP(obj = obj, ident = "notstress.ident4.thresh_cluster", save.plot = F, aspect.ratio = 1) +
ggplot2::ggtitle(ggplot2::element_blank()) +
ggplot2::scale_color_manual(values=c(Seurat.utils::gg_color_hue(2)[1], Seurat.utils::gg_color_hue(2)[2]))+ Seurat::NoAxes()
count <- count + 1
}
message("The Shiny interface was not called, please check the proposed thresholds manually! GO term specific UMAPs will be plotted.")
gridExtra::grid.arrange(grobs=ptlist,ncol=ceiling(length(ptlist)/2), title = "GO score")
}
print(CodeAndRoll2::pc_TRUE(stress.bool, suffix = "of cells are classified as stressed. Adapt individual score-thresholds if deemed inaccurate."))
return(obj)
}
%%R
adata_no_knn_all_SOBJ <- Shiny.GO.thresh(obj = adata_no_knn_all_SOBJ,
call.shiny.app = FALSE,
stress.ident1 = i1,
stress.ident2 = i2,
notstress.ident3 = i3,
plot.cluster.shiny = "orig.ident")
[1] "16.4 % of cells are classified as stressed. Adapt individual score-thresholds if deemed inaccurate."
%%R -o ScoresDF
ScoresDF <- adata_no_knn_all_SOBJ@meta.data["is.Stressed"]
%%R
(i1 <- Stringendo::kppu(granule.res.4.gruffi, 'cl.av', go1)) (i2 <- Stringendo::kppu(granule.res.4.gruffi, 'cl.av', go2)) (i3 <- Stringendo::kppu(granule.res.4.gruffi, 'cl.av', go3))
adata_no_knn_all_SOBJ <- Shiny.GO.thresh(obj = adata_no_knn_all_SOBJ, stress.ident1 = i1, stress.ident2 = i2, notstress.ident3 = i3, plot.cluster.shiny = "orig.ident")
"Dont forget to click the button in the app: Save New Thresholds"
adata.obs = pd.concat([adata.obs, ScoresDF],axis = 1)
adata.obs["is.Stressed"] = adata.obs["is.Stressed"].replace({0:"not_stressed",1:"stressed"})
sc.pl.umap(adata, color=["is.Stressed"], size = 30, add_outline = True,outline_width=(0.2, 0.05), ncols = 2,frameon=False,
save=nb_fname+".stressLoc.UMAP.pdf")
... storing 'is.Stressed' as categorical
WARNING: saving figure to file figures/umap02_Preprocess_Gruffi.stressLoc.UMAP.pdf
import holoviews as hv
from bokeh.io import export_svgs
obsTupleToMap = ("region","is.Stressed")
SankeyDF=adata.obs[list(obsTupleToMap)]
SankeyDF.columns = ["region","is.Stressed"]
SankeyDF = SankeyDF.groupby(['region','is.Stressed']).size().reset_index().rename(columns={0:'count'})
SankeyDF=SankeyDF[SankeyDF["count"] != 0]
hv.extension('bokeh')
SankeyDF.to_csv(outdir+"/SankeyDF.tsv", sep="\t")
sankey1 = hv.Sankey(SankeyDF, kdims=["region", "is.Stressed"], vdims="count")
sankey1.opts(cmap='Colorblind',label_position='outer',
edge_color='region', edge_line_width=0,
node_alpha=1.0, node_width=40, node_sort=True,
width=1100, height=700, bgcolor="white")
#hv.save(sankey1, './Figures_publication/StressedCells.png', fmt='png')
print(adata)
print(adata[adata.obs["is.Stressed"] == "not_stressed" ] )
adata = adata[adata.obs["is.Stressed"] == "not_stressed" ]
adata.write_h5ad(outdir+"/02B_Preprocess_Gruffi_filtered.h5ad")
AnnData object with n_obs × n_vars = 22220 × 28371
obs: 'dataset', 'organoid', 'region', 'type', 'type_region', 'regionContrast', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'n_genes', 'is.Stressed'
var: 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'highly_variable', 'mean', 'std'
uns: 'dataset_colors', 'diffmap_evals', 'draw_graph', 'neighbors', 'organoid_colors', 'pca', 'regionContrast_colors', 'region_colors', 'type_colors', 'umap', 'is.Stressed_colors'
obsm: 'X_diffmap', 'X_draw_graph_fa', 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
View of AnnData object with n_obs × n_vars = 18822 × 28371
obs: 'dataset', 'organoid', 'region', 'type', 'type_region', 'regionContrast', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'n_genes', 'is.Stressed'
var: 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'highly_variable', 'mean', 'std'
uns: 'dataset_colors', 'diffmap_evals', 'draw_graph', 'neighbors', 'organoid_colors', 'pca', 'regionContrast_colors', 'region_colors', 'type_colors', 'umap', 'is.Stressed_colors'
obsm: 'X_diffmap', 'X_draw_graph_fa', 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
adata.write_h5ad(outdir+"/02B_Preprocess_Gruffi_filtered.h5ad")
adata = sc.read_h5ad(outdir+"/02B_Preprocess_Gruffi_filtered.h5ad")
adata = adata.raw.to_adata().copy()
# Given the epxloratory phase as many genes were retained
VERTICAL_HVGs = {}
for group in adata.obs.type_region.unique():
adata_group = adata[adata.obs.type_region == group].copy()
HVGsDF = sc.pp.highly_variable_genes(adata_group, min_mean=0.0125, max_mean=3, min_disp=0.5, inplace=False, batch_key="organoid")
VERTICAL_HVGs[group] = set(HVGsDF[HVGsDF["highly_variable_nbatches"] == 3].index)
VERTICAL_HVGs = set.union(*list(VERTICAL_HVGs.values()))
extracting highly variable genes
finished (0:00:00)
extracting highly variable genes
finished (0:00:00)
extracting highly variable genes
finished (0:00:00)
extracting highly variable genes
finished (0:00:00)
extracting highly variable genes
finished (0:00:00)
extracting highly variable genes
finished (0:00:01)
len(VERTICAL_HVGs)
3377
import itertools
# Setting up contrasts
proximal = ["polaroid1_proximal","polaroid2_proximal","polaroid3_proximal"]
medial = ["polaroid1_medial","polaroid2_medial","polaroid3_medial"]
distal = ["polaroid1_distal","polaroid2_distal","polaroid3_distal"]
p1 = ["control1_piece1","control2_piece1","control3_piece1"]
p2 = ["control1_piece2","control2_piece2","control3_piece2"]
p3 = ["control1_piece3","control2_piece3","control3_piece3"]
proximal_vs_medial = list(itertools.product(proximal, medial))
medial_vs_distal = list(itertools.product(medial, distal))
p1_vs_p2 = list(itertools.product(p1, p2))
p2_vs_p3 = list(itertools.product(p2, p3))
HORIZONTAL_HVGs = {}
# Proximal vs distal regions
# Proximal vs distal regions
# Proximal vs distal regions
proximal_vs_medial_HVGs = {}
for contrast in proximal_vs_medial:
adataContrast = adata[adata.obs["dataset"].isin(list(contrast))].copy()
print(adataContrast.obs.dataset.value_counts())
sc.pp.highly_variable_genes(adataContrast, min_mean=0.0125, max_mean=3, min_disp=0.5)
HVGs = set(adataContrast.var[adataContrast.var.highly_variable].index.tolist())
proximal_vs_medial_HVGs["_".join(contrast)] = HVGs
proximal_vs_medial_HVGs = pd.Series(list(itertools.chain.from_iterable([list(i) for i in list(proximal_vs_medial_HVGs.values())])))
proximal_vs_medial_HVGs = set(proximal_vs_medial_HVGs.value_counts()[proximal_vs_medial_HVGs.value_counts() == 9].index.tolist())
HORIZONTAL_HVGs["proximal_vs_medial_HVGs"] = proximal_vs_medial_HVGs
# medial vs distal regions
# medial vs distal regions
# medial vs distal regions
medial_vs_distal_HVGs = {}
for contrast in medial_vs_distal:
adataContrast = adata[adata.obs["dataset"].isin(list(contrast))].copy()
print(adataContrast.obs.dataset.value_counts())
sc.pp.highly_variable_genes(adataContrast, min_mean=0.0125, max_mean=3, min_disp=0.5)
HVGs = set(adataContrast.var[adataContrast.var.highly_variable].index.tolist())
medial_vs_distal_HVGs["_".join(contrast)] = HVGs
medial_vs_distal_HVGs = pd.Series(list(itertools.chain.from_iterable([list(i) for i in list(medial_vs_distal_HVGs.values())])))
medial_vs_distal_HVGs = set(medial_vs_distal_HVGs.value_counts()[medial_vs_distal_HVGs.value_counts() == 9].index.tolist())
HORIZONTAL_HVGs["medial_vs_distal_HVGs"] = medial_vs_distal_HVGs
# P1 vs P2
# P1 vs P2
# P1 vs P2
p1_vs_p2_HVGs = {}
for contrast in p1_vs_p2:
adataContrast = adata[adata.obs["dataset"].isin(list(contrast))].copy()
print(adataContrast.obs.dataset.value_counts())
sc.pp.highly_variable_genes(adataContrast, min_mean=0.0125, max_mean=3, min_disp=0.5)
HVGs = set(adataContrast.var[adataContrast.var.highly_variable].index.tolist())
p1_vs_p2_HVGs["_".join(contrast)] = HVGs
p1_vs_p2_HVGs = pd.Series(list(itertools.chain.from_iterable([list(i) for i in list(p1_vs_p2_HVGs.values())])))
p1_vs_p2_HVGs = set(p1_vs_p2_HVGs.value_counts()[p1_vs_p2_HVGs.value_counts() == 9].index.tolist())
HORIZONTAL_HVGs["p1_vs_p2_HVGs"] = p1_vs_p2_HVGs
# P2 vs P3
# P2 vs P3
# P2 vs P3
p2_vs_p3_HVGs = {}
for contrast in p2_vs_p3:
adataContrast = adata[adata.obs["dataset"].isin(list(contrast))].copy()
print(adataContrast.obs.dataset.value_counts())
sc.pp.highly_variable_genes(adataContrast, min_mean=0.0125, max_mean=3, min_disp=0.5)
HVGs = set(adataContrast.var[adataContrast.var.highly_variable].index.tolist())
p2_vs_p3_HVGs["_".join(contrast)] = HVGs
p2_vs_p3_HVGs = pd.Series(list(itertools.chain.from_iterable([list(i) for i in list(p2_vs_p3_HVGs.values())])))
p2_vs_p3_HVGs = set(p2_vs_p3_HVGs.value_counts()[p2_vs_p3_HVGs.value_counts() == 9].index.tolist())
HORIZONTAL_HVGs["p2_vs_p3_HVGs"] = p2_vs_p3_HVGs
polaroid1_proximal 2076
polaroid1_medial 1733
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid1_proximal 2076
polaroid2_medial 670
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid1_proximal 2076
polaroid3_medial 616
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid2_proximal 2097
polaroid1_medial 1733
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid2_proximal 2097
polaroid2_medial 670
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid2_proximal 2097
polaroid3_medial 616
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid3_proximal 1920
polaroid1_medial 1733
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid3_proximal 1920
polaroid2_medial 670
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid3_proximal 1920
polaroid3_medial 616
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid1_medial 1733
polaroid1_distal 1036
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid1_medial 1733
polaroid2_distal 255
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid1_medial 1733
polaroid3_distal 225
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid1_distal 1036
polaroid2_medial 670
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid2_medial 670
polaroid2_distal 255
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid2_medial 670
polaroid3_distal 225
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid1_distal 1036
polaroid3_medial 616
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid3_medial 616
polaroid2_distal 255
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
polaroid3_medial 616
polaroid3_distal 225
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control1_piece2 468
control1_piece1 299
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control2_piece2 1224
control1_piece1 299
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control3_piece2 1200
control1_piece1 299
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control2_piece1 1069
control1_piece2 468
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control2_piece2 1224
control2_piece1 1069
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control3_piece2 1200
control2_piece1 1069
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control3_piece1 1016
control1_piece2 468
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control2_piece2 1224
control3_piece1 1016
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control3_piece2 1200
control3_piece1 1016
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control1_piece3 484
control1_piece2 468
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control2_piece3 997
control1_piece2 468
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control3_piece3 1437
control1_piece2 468
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control2_piece2 1224
control1_piece3 484
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control2_piece2 1224
control2_piece3 997
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control3_piece3 1437
control2_piece2 1224
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control3_piece2 1200
control1_piece3 484
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control3_piece2 1200
control2_piece3 997
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
control3_piece3 1437
control3_piece2 1200
Name: dataset, dtype: int64
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
HORIZONTAL_HVGs = set.union(*list(HORIZONTAL_HVGs.values()))
JointHVGs = list(HORIZONTAL_HVGs.union(VERTICAL_HVGs))
adata.var["highly_variable"] = adata.var_names.isin(JointHVGs)
adata.raw = adata.copy()
sc.pp.scale(adata, max_value=20)
... as `zero_center=True`, sparse input is densified and may lead to large memory consumption
sc.tl.pca(adata, svd_solver='arpack', use_highly_variable=True)
computing PCA
on highly variable genes
with n_comps=50
finished (0:00:05)
sc.pl.pca(adata, color=["organoid","type","dataset",], size = 30, add_outline = True,outline_width=(0.2, 0.05))
sc.pl.pca_variance_ratio(adata, log=True)
sc.pp.neighbors(adata, n_neighbors=20, n_pcs=20)
computing neighbors
using 'X_pca' with n_pcs = 20
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
sc.tl.umap(adata)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:00:11)
sc.tl.diffmap(adata)
computing Diffusion Maps using n_comps=15(=n_dcs)
computing transitions
finished (0:00:00)
eigenvalues of transition matrix
[1. 0.9980236 0.99751395 0.99457854 0.9930555 0.9916404
0.98803663 0.9859242 0.9842043 0.98254275 0.981802 0.9785987
0.97799367 0.9754154 0.9751553 ]
finished: added
'X_diffmap', diffmap coordinates (adata.obsm)
'diffmap_evals', eigenvalues of transition matrix (adata.uns) (0:00:00)
sc.pl.umap(adata, color=["organoid","type","dataset","MKI67"], size = 30, add_outline = True,outline_width=(0.2, 0.05))
sc.pl.diffmap(adata, color=["organoid","type","dataset",], size = 30, add_outline = True,outline_width=(0.2, 0.05))
sc.tl.draw_graph(adata, n_jobs=4)
drawing single-cell graph using layout 'fa'
finished: added
'X_draw_graph_fa', graph_drawing coordinates (adata.obsm) (0:02:17)
sc.pl.draw_graph(adata, color=["organoid","type","dataset","MKI67"], size = 30, add_outline = True,outline_width=(0.2, 0.05))
adata.write_h5ad(outdir+"/02C_Preprocess_final.h5ad")